1 Load the data

Load data from Nature 488, pp. 621-626: STAT.RData

load('STAT.RData')
library(phyloseq)
head(sample_data(phy))
## Sample Data:        [6 samples by 6 sample variables]:
##           X.SampleID BarcodeSequence LinkerPrimerSequence Treatment
## cecal_C1    cecal_C1              NA                   NA         C
## cecal_C10  cecal_C10              NA                   NA         C
## cecal_C2    cecal_C2              NA                   NA         C
## cecal_C3    cecal_C3              NA                   NA         C
## cecal_C4    cecal_C4              NA                   NA         C
## cecal_C5    cecal_C5              NA                   NA         C
##           Location         Description
## cecal_C1     cecal missing_description
## cecal_C10    cecal missing_description
## cecal_C2     cecal missing_description
## cecal_C3     cecal missing_description
## cecal_C4     cecal missing_description
## cecal_C5     cecal missing_description
levels(sample_data(phy)$Treatment)
## [1] "C"  "P"  "T"  "V"  "VP"
head(phenotypes)
##      GIP Insulin Leptin IGF1    BMD mFat pFat
## C1  8.61       1   2320 1250 0.0492  4.0 19.7
## C2 28.60     137   1040 2180 0.0487  3.7 18.6
## C3 19.20     543   1670 2160 0.0510  3.6 19.5
## C4 38.30     606    972 2070 0.0465  4.0 21.3
## C5 13.20       1   2210 1240 0.0547  4.1 21.6
## C6 36.80     696   2330 1830 0.0512  3.7 17.7
phy
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 6547 taxa and 96 samples ]
## sample_data() Sample Data:       [ 96 samples by 6 sample variables ]
## tax_table()   Taxonomy Table:    [ 6547 taxa by 9 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 6547 tips and 6321 internal nodes ]
sample_names(phy)[1:5]
## [1] "cecal_C1"  "cecal_C10" "cecal_C2"  "cecal_C3"  "cecal_C4"

The sample names are in different formats: phenotypes are per mouse; phy data are per mouse-location.

 


2 Merge phenotypes

Merge phenotypes variable with the phyloseq data

pheno1 = phenotypes
pheno2 = phenotypes

rownames(pheno1) = paste('cecal', rownames(pheno1), sep="_")
rownames(pheno2) = paste('fecal', rownames(pheno2), sep="_")
pheno = rbind(pheno1, pheno2)

phy2 = merge_phyloseq(phy, sample_data(pheno))

phy
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 6547 taxa and 96 samples ]
## sample_data() Sample Data:       [ 96 samples by 6 sample variables ]
## tax_table()   Taxonomy Table:    [ 6547 taxa by 9 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 6547 tips and 6321 internal nodes ]
phy2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 6547 taxa and 96 samples ]
## sample_data() Sample Data:       [ 96 samples by 13 sample variables ]
## tax_table()   Taxonomy Table:    [ 6547 taxa by 9 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 6547 tips and 6321 internal nodes ]
head(sample_data(phy2))
## Sample Data:        [6 samples by 13 sample variables]:
##           X.SampleID BarcodeSequence LinkerPrimerSequence Treatment
## cecal_C1    cecal_C1              NA                   NA         C
## cecal_C10  cecal_C10              NA                   NA         C
## cecal_C2    cecal_C2              NA                   NA         C
## cecal_C3    cecal_C3              NA                   NA         C
## cecal_C4    cecal_C4              NA                   NA         C
## cecal_C5    cecal_C5              NA                   NA         C
##           Location         Description   GIP Insulin Leptin IGF1    BMD
## cecal_C1     cecal missing_description  8.61       1   2320 1250 0.0492
## cecal_C10    cecal missing_description 27.70     516   2680 3720 0.0479
## cecal_C2     cecal missing_description 28.60     137   1040 2180 0.0487
## cecal_C3     cecal missing_description 19.20     543   1670 2160 0.0510
## cecal_C4     cecal missing_description 38.30     606    972 2070 0.0465
## cecal_C5     cecal missing_description 13.20       1   2210 1240 0.0547
##           mFat pFat
## cecal_C1   4.0 19.7
## cecal_C10  4.0 19.8
## cecal_C2   3.7 18.6
## cecal_C3   3.6 19.5
## cecal_C4   4.0 21.3
## cecal_C5   4.1 21.6

 


3 Estimate richness

Estimate observed species with Chao1 and Shannon diversity using estimate_richness() function

?estimate_richness
alpha = estimate_richness(phy2, measures=c("Observed", "Chao1", "Shannon"))
head(alpha)
##           Observed    Chao1  se.chao1  Shannon
## cecal_C1       662 1650.429 148.60209 4.836219
## cecal_C10      592 1508.453 153.81056 4.814829
## cecal_C2       617 1216.394  91.48394 4.775428
## cecal_C3       731 1746.219 144.87205 4.918447
## cecal_C4       601 1452.068 137.36109 4.636482
## cecal_C5       698 1434.391 104.96195 4.944320
plot(ecdf(sample_sums(phy2)))

 


4 Rarefied diversities

Estimate rarified diversities in #3 averaged over 20 replicates and compare the two estimates

4.1 Let’s only keep samples that have at least 3500 observations/reads

phy3 = subset_samples(phy2, sample_sums(phy2)>3500)
alpha = estimate_richness(phy3, measures=c("Observed", "Chao1", "Shannon"))

4.2 Estimate rarefied diversities

alpha.rare = matrix(0, ncol=ncol(alpha), nrow=nrow(alpha))
for(i in 1:20){
  alpha.rare= alpha.rare + 
    estimate_richness(rarefy_even_depth(phy3, sample.size = 3500,
                                        verbose=F, trimOTUs = F), 
                      measures=c("Observed", "Chao1", "Shannon"))
}
alpha.rare = alpha.rare/20

4.3 Compare the rarefied vs non-rarefied alpha diversity

plot(alpha$Observed, alpha.rare$Observed)

plot(alpha$Chao1, alpha.rare$Chao1)

plot(alpha$Shannon, alpha.rare$Shannon)

 


5 Order-level abundances

5.1 Compute order level abundances

Summarize the data at the order level

order.phy = tax_glom(phy3, taxrank = "Order")

5.2 Normalize the data

Normalize the data using CLR, relative abundance, and DESeq2

5.2.1 relative abundance

order.rel = transform_sample_counts(order.phy, function(x) x/sum(x))

5.2.2 clr

order.clr = transform_sample_counts(order.phy, function(x){y=log(x+1); y/sum(y)})

5.2.3 DESeq2

library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:phyloseq':
## 
##     distance
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:phyloseq':
## 
##     sampleNames
countData = round(as(otu_table(order.phy), "matrix"), digits = 0)
countData = countData + 1L
dds <- DESeqDataSetFromMatrix(countData, sample_data(order.phy), design = ~1)
## Warning in class(from) <- NULL: Setting class(x) to NULL; result will no
## longer be an S4 object
## converting counts to integer mode
order.deseq = merge_phyloseq(otu_table(counts(estimateSizeFactors(dds), 
                                              normalized=T), taxa_are_rows = T), 
                             sample_data(order.phy), 
                             phy_tree(order.phy), 
                             tax_table(order.phy))

 


6 Univariate tests

Compute appropriate univariate tests on normalized data

6.1 Compare location

Location = sample_data(order.rel)$Location
rel.p = apply(otu_table(order.rel),1, function(x) wilcox.test(c(x)~Location)$p.value)
## Warning in wilcox.test.default(x = structure(c(0.00614334470989761,
## 0.00181159420289855, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0.00068259385665529,
## 0.0072463768115942, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0.00068259385665529,
## 0.000452898550724638, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0,
## 0.000630119722747322, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0,
## 0.000630119722747322, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0.000452898550724638, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0.000452898550724638, :
## cannot compute exact p-value with ties
clr.p = apply(otu_table(order.clr),1, function(x) t.test(c(x)~Location)$p.value)
deseq.p = apply(otu_table(order.deseq),1, function(x) t.test(log10(c(x))~Location)$p.value)

univ.location.res = data.frame(taxa = tax_table(order.clr)[,"Order"], rel.p, clr.p, deseq.p)

6.2 do FDR

Adjust the p-values using False Discovery Rate

univ.location.res$rel.fdr = p.adjust(univ.location.res$rel.p, method="fdr")
univ.location.res$clr.fdr = p.adjust(univ.location.res$clr.p, method="fdr")
univ.location.res$deseq.fdr = p.adjust(univ.location.res$deseq.p, method="fdr")

colSums(univ.location.res<0.05)
## Warning in Ops.factor(left, right): '<' not meaningful for factors
##     Order     rel.p     clr.p   deseq.p   rel.fdr   clr.fdr deseq.fdr 
##        NA        10         8         9         9         7         7
univ.location.res
##                     Order        rel.p        clr.p      deseq.p
## 1948        Bacteroidales 1.453797e-11 9.983697e-01 4.902762e-01
## 1931 "Erysipelotrichales" 8.437764e-01 2.076959e-01 5.187488e-01
## 4911    Anaeroplasmatales 9.721986e-10 2.249341e-15 5.735335e-14
## 4220        Clostridiales 1.186926e-19 1.894648e-32 3.651509e-18
## 1026           Bacillales 8.032710e-08 1.301173e-06 4.190743e-06
## 1673    "Lactobacillales" 6.834879e-16 4.321846e-19 1.071339e-16
## 4789     Actinobacteridae 1.132115e-01 7.601720e-02 4.509191e-02
## 5995      Burkholderiales 7.245135e-02 8.371138e-02 1.021904e-01
## 5975         Neisseriales 3.437768e-01 3.224313e-01 1.034844e-01
## 5242     Sphingomonadales 7.253024e-02 9.877532e-02 2.072876e-01
## 1453      Rhodobacterales 3.437768e-01 3.224313e-01 1.066468e-01
## 455           Rhizobiales 3.118829e-01 3.227785e-01 8.148514e-01
## 6648         Myxococcales 3.118829e-01 3.227785e-01 8.165448e-01
## 3405    Enterobacteriales 2.819788e-02 3.464769e-02 3.530636e-02
## 2012       Pasteurellales 9.633972e-01 9.332040e-01 6.386468e-01
## 1847      Pseudomonadales 1.239806e-03 4.114033e-03 4.024734e-03
## 2172          Chloroplast 8.788168e-05 3.013895e-04 2.380779e-04
## 3806     Flavobacteriales 1.461646e-01 1.886147e-01 3.350350e-01
## 4609      Mycoplasmatales 5.427940e-01 2.501740e-01 2.868498e-01
## 3348   Desulfovibrionales 1.954624e-02 9.799033e-01 2.337175e-01
## 2626      Coriobacteridae 1.557592e-11 1.459745e-12 3.481561e-12
##           rel.fdr      clr.fdr    deseq.fdr
## 1948 8.177357e-11 9.983697e-01 6.052069e-01
## 1931 8.859653e-01 3.355087e-01 6.052069e-01
## 4911 4.083234e-09 1.574539e-14 4.014735e-13
## 4220 2.492544e-18 3.978760e-31 7.668168e-17
## 1026 2.811448e-07 5.464927e-06 1.760112e-05
## 1673 7.176623e-15 4.537938e-18 1.124905e-15
## 4789 1.828801e-01 1.757939e-01 1.052145e-01
## 5995 1.269279e-01 1.757939e-01 1.866319e-01
## 5975 4.010729e-01 3.765749e-01 1.866319e-01
## 5242 1.269279e-01 1.885711e-01 3.348492e-01
## 1453 4.010729e-01 3.765749e-01 1.866319e-01
## 455  4.010729e-01 3.765749e-01 8.165448e-01
## 6648 4.010729e-01 3.765749e-01 8.165448e-01
## 3405 5.921555e-02 9.095018e-02 9.267919e-02
## 2012 9.633972e-01 9.983697e-01 7.058727e-01
## 1847 3.254492e-03 1.234210e-02 1.207420e-02
## 2172 2.636450e-04 1.054863e-03 8.332725e-04
## 3806 2.192469e-01 3.300758e-01 4.397334e-01
## 4609 5.999302e-01 3.752611e-01 4.015897e-01
## 3348 4.560790e-02 9.983697e-01 3.505762e-01
## 2626 8.177357e-11 7.663660e-12 1.827820e-11

6.3 Plot abundances

Plot the abundances using plot_heatmap(), plot_bar(), or produce box and whisker plots

6.3.1 Plot heatmaps

library(ggplot2)
plot_heatmap(order.rel, taxa.label = "Order", sample.order = "Location")+
  facet_grid(.~Location, scales = "free_x")

plot_heatmap(order.clr, taxa.label = "Order", sample.order = "Location")+
  facet_grid(.~Location, scales = "free_x")

plot_heatmap(order.deseq, taxa.label = "Order", sample.order = "Location")+
  facet_grid(.~Location, scales = "free_x")

6.3.2 Plot barplots of all data

plot_bar(order.rel, fill="Location") + 
  facet_wrap(facets = "Order", ncol=7, nrow=3, scales = "free_y")

plot_bar(order.clr, fill="Location") + 
  facet_wrap(facets = "Order", ncol=7, nrow=3, scales = "free_y")

plot_bar(order.deseq, fill="Location") + 
  facet_wrap(facets = "Order", ncol=7, nrow=3, scales = "free_y")

6.3.3 Plot box and whiskers charts

ggplot(psmelt(order.rel), aes(x=Location, y=Abundance)) + 
  geom_boxplot(aes(fill=Location)) + geom_jitter() + 
  facet_wrap(facets="Order", ncol=7, nrow=3, scales="free_y")

ggplot(psmelt(order.clr), aes(x=Location, y=Abundance)) + 
  geom_boxplot(aes(fill=Location)) + geom_jitter() + 
  facet_wrap(facets="Order", ncol=7, nrow=3, scales="free_y")

ggplot(psmelt(order.deseq), aes(x=Location, y=Abundance)) + 
  geom_boxplot(aes(fill=Location)) + geom_jitter() + 
  facet_wrap(facets="Order", ncol=7, nrow=3, scales="free_y") + scale_y_log10()

 


7 Testing in fecal samples

7.1 Subset to only fecal samples

Subset the order level data to only fecal samples

fecal.rel = subset_samples(order.rel, Location = "fecal")

7.2 Filter low abundance taxa

Filter out taxa with low abundance (mean abundance < 0.1%)

fecal.rel.subs = subset_taxa(fecal.rel, rowMeans(otu_table(fecal.rel))> 0.001)

7.3 Fecal samples vs Treatment

Perform univariate analysis of the fecal subset with respect to the Treatment variable

Treatment = sample_data(fecal.rel.subs)$Treatment

fecal.res = data.frame(taxa = tax_table(fecal.rel.subs)[,"Order"], 
                               fecal.p = apply(otu_table(fecal.rel.subs),1, 
                                               function(x) kruskal.test(c(x)~Treatment)$p.value))
fecal.res$fecal.fdr = p.adjust(fecal.res$fecal.p, method="fdr")

ggplot(psmelt(fecal.rel.subs), aes(x=Treatment, y=Abundance)) + 
  geom_boxplot(aes(fill=Treatment)) + geom_jitter() + 
  facet_wrap(facets="Order", ncol=4, nrow=2, scales="free_y")

7.4 Fecal samples vs BMD

fecal.deseq = subset_samples(order.deseq, Location = "fecal")

pheno.res = data.frame(taxa = tax_table(fecal.deseq)[,"Order"], 
                       BMD.p = apply(otu_table(fecal.deseq),1, 
                                       function(x) cor.test(log(c(x)), sample_data(fecal.deseq)$BMD)$p.value))
pheno.res$BMD.fdr = p.adjust(pheno.res$BMD.p, method="fdr")

pheno.res$pFAT.p = apply(otu_table(fecal.deseq),1, 
                         function(x) cor.test(log(c(x)), sample_data(fecal.deseq)$pFat)$p.value)

pheno.res$pFAT.fdr = p.adjust(pheno.res$pFAT.p, method="fdr")

7.4.1 Compute global FDR

gFDR = with(pheno.res, 
            matrix(p.adjust(c(BMD.p, pFAT.p), method="fdr"), 
                   ncol=2, nrow=nrow(pheno.res), byrow=F))
gFDR
##            [,1]      [,2]
##  [1,] 0.8628829 0.8628829
##  [2,] 0.8628829 0.3284212
##  [3,] 0.8628829 0.8628829
##  [4,] 0.8783478 0.8628829
##  [5,] 0.8628829 0.8628829
##  [6,] 0.9870451 0.9870451
##  [7,] 0.8628829 0.9051185
##  [8,] 0.9058371 0.8628829
##  [9,] 0.8628829 0.8628829
## [10,] 0.8628829 0.8628829
## [11,] 0.8628829 0.8628829
## [12,] 0.8628829 0.8628829
## [13,] 0.8628829 0.8828143
## [14,] 0.8628829 0.8628829
## [15,] 0.8628829 0.8628829
## [16,] 0.8628829 0.8628829
## [17,] 0.9247915 0.3284212
## [18,] 0.8628829 0.9870451
## [19,] 0.8628829 0.9247915
## [20,] 0.8628829 0.9910441
## [21,] 0.9576660 0.9576660

7.4.2 Plot phenotypic variables vs. “the most significant” result

ggplot(subset(psmelt(fecal.deseq), Order=="Enterobacteriales"), aes(x=BMD, y=Abundance)) +
  geom_point() + scale_y_log10()

ggplot(subset(psmelt(fecal.deseq), Order=="Enterobacteriales"), aes(x=pFat, y=Abundance)) + 
  geom_point() + scale_y_log10()

7.4.3 Plot phenotypic variables vs. Treatment

ggplot(sample_data(fecal.deseq), aes(x=Treatment, y=BMD)) + 
  geom_boxplot(aes(fill=Treatment))+geom_jitter()

ggplot(sample_data(fecal.deseq), aes(x=Treatment, y=pFat)) + 
  geom_boxplot(aes(fill=Treatment))+geom_jitter()